import requests
import time
from bs4 import BeautifulSoup
import os
from lxml import etree


# # 1
# # Fetch the whole page and save its HTML to a file
# import urllib.request
#
# data = urllib.request.urlopen('http://www.baidu.com')
#
# ret = data.read()
#
# print(ret)
#
# file = open('F:/test/1.html', 'wb')
#
# file.write(ret)
#
# file.close()


# # 2

# url = 'https://www.baidu.com'
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
# }
# params = {'type': 'content', 'q': 'python'}
# res = requests.get(url=url, headers=headers, params=params)
#
# res.encoding = 'utf-8'
# with open('baidu.html', 'w', encoding='utf-8') as f:
#     f.write(res.text)
# print(res)

# # 3
class ChongZi(object):
    """Scraper for the ranking pages under www.phb123.com/renwu/fuhao/.

    The instance holds the HTTP headers sent with each request and the list
    of page indices to download. ``get_top250`` downloads every page, pulls
    the text of the ranking table body out of it and collects the results.
    """

    # One URL per page index; ``%s`` is replaced with an entry of ``statnum``.
    URL_TEMPLATE = 'https://www.phb123.com/renwu/fuhao/shishi_%s.html'
    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot block the script forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
        }

        # Page indices 0..3 that get substituted into URL_TEMPLATE.
        self.statnum = list(range(0, 4))

    def get_top250(self):
        """Download every page in ``self.statnum`` and collect the table text.

        The method name is kept for compatibility with existing callers.
        Each matched ``<tbody>`` text is printed as it is found, and the
        complete list is printed once at the end.

        Returns:
            list[str]: the text of every matched table body, in page order.

        Raises:
            requests.exceptions.RequestException: if a request fails at the
                network level or exceeds ``REQUEST_TIMEOUT``.
        """
        list1 = []
        for page in self.statnum:
            url = self.URL_TEMPLATE % page
            response = requests.get(
                url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)

            # An error page contains no ranking table; skip it explicitly
            # instead of silently parsing it.
            if not response.ok:
                print('skip %s: HTTP %s' % (url, response.status_code))
                continue

            # Name the parser explicitly so the parse tree does not depend on
            # which parsers happen to be installed (lxml is already imported
            # by this file).
            soup = BeautifulSoup(response.text, 'lxml')

            # NOTE(review): this selector only matches when the served HTML
            # contains an explicit <tbody>; confirm against the live page.
            table_bodies = soup.select(
                'body > div.wrap.mar1.clearfix > div.sp-l > div.mar1 > table > tbody')
            for table_body in table_bodies:
                text = table_body.get_text()
                list1.append(text)
                print(text)

        print(str(list1))
        return list1

if __name__ == '__main__':
    # Step 1: fetch the data.
    scraper = ChongZi()
    scraper.get_top250()
    # Step 2: organize the data (nothing implemented here yet).


# from lxml import etree
# html_str = '''
# <div>
#     <ul>
#          <li class="item1"><a href="link1.html">Python</a></li>
#          <li class="item2"><a href="link2.html">Java</a></li>
#          <li class="site1"><a href="c.biancheng.net">C语言中文网</a>
#          <li class="site2"><a href="www.baidu.com">百度</a></li>
#          <li class="site3"><a href="www.jd.com">京东</a></li>
#      </ul>
# </div>
# '''
# html = etree.HTML(html_str)
#
# # tostring() serializes the element tree to a string; note: the result is of type bytes
# result = etree.tostring(html)
#
# print(result.decode('utf-8'))
